1 Setup

suppressPackageStartupMessages({
  library(DESeq2)
  library(dplyr)
  library(gplots)
  library(ggplot2)
  library(here)
  library(hyperSpec)
  library(limma)
  library(parallel)
  library(plotly)
  library(tibble)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
load(here("analysis/salmon/ZE-29Seed-dds.rda"))
levels(dds.29z$Experiment) <- c("ZE","Seed")

2 Normalise

vsd <- varianceStabilizingTransformation(dds.29z,blind=FALSE)

3 Batch effect

3.1 Estimation

mat <- assay(vsd)

We select FMG and ZE from mature seeds (29Seed), that correspond to Stage B8 and B9 of the ZE

sel <- dds.29z$Experiment == "Seed" | dds.29z$Time %in% c("B8","B9")

mat <- mat[,sel]

batch = dds.29z$Experiment[sel]

contrasts(batch) <- contr.sum(levels(batch))

design <- model.matrix(~batch)

fit <- lmFit(mat, design)

beta is the actual batch correction for each gene

beta <- fit$coefficients[, -1, drop = FALSE]
beta[is.na(beta)] <- 0

cross-validation with the removeBatchEffect function

stopifnot(all((mat - beta %*% t(design[,-1,drop=FALSE]))[1:6,1:6] == 
                removeBatchEffect(mat,dds.29z$Experiment[sel])[1:6,1:6]))

3.2 Correction

load(here("analysis/salmon/ZE-SE-dds.rda"))
levels(dds.sz$Experiment) <- c("ZE","SE")
vsd <-varianceStabilizingTransformation(dds.sz,blind=FALSE)

fullbatch <- vsd$Experiment
contrasts(fullbatch) <- contr.sum(levels(fullbatch))
fullbatch <- model.matrix(~fullbatch)[, -1, drop = FALSE]

assay(vsd) <- assay(vsd) - beta %*% t(fullbatch)

4 Quality Assessment

4.1 PCA

pc <- prcomp(t(assay(vsd)))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=2

An the number of possible combinations

nlevel=nlevels(vsd$Tissue) * nlevels(vsd$Time)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

4.1.1 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(vsd)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

4.1.2 Heatmap

Filter for noise - the filtering is meant to restrict the number of genes to be visualised to a reasonable amount that can be viewed in a limited amount of time

conds <- factor(paste(vsd$Time,vsd$Tissue))
sels <- rangeFeatureSelect(counts=assay(vsd),
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot

vst.cutoff <- 6
  • Heatmap (>20k genes)
hm <- heatmap.2(t(scale(t(assay(vsd)[sels[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds,cex=0.8)

5 Export

dir.create(here("analysis/batchCorrection"),showWarnings=FALSE)
save(vsd,file=here("analysis/batchCorrection/vsd.rda"))

6 Example limma design

look https://www.bioconductor.org/packages//2.11/bioc/vignettes/limma/inst/doc/usersguide.pdf e.g. chapter 8.5.4 select the data to drop Stage B0, B9 and B10 (insufficient / lack of replication)

vsd <- vsd[,!vsd$Time %in% c("B0","B9","B10")]
vsd$Time <- droplevels(vsd$Time)
Stage<-vsd$Time
Tissue<-vsd$Tissue
design <- model.matrix(~Stage*Tissue)
fit <- lmFit(assay(vsd), design)
fit <- eBayes(fit)
## Warning: Zero sample variances detected, have been offset away from zero
colnames(design)
##  [1] "(Intercept)"      "StageB2"          "StageB3"          "StageB4"         
##  [5] "StageB5"          "StageB6"          "StageB7"          "StageB8"         
##  [9] "TissueSE"         "TissueZE"         "StageB2:TissueSE" "StageB3:TissueSE"
## [13] "StageB4:TissueSE" "StageB5:TissueSE" "StageB6:TissueSE" "StageB7:TissueSE"
## [17] "StageB8:TissueSE" "StageB2:TissueZE" "StageB3:TissueZE" "StageB4:TissueZE"
## [21] "StageB5:TissueZE" "StageB6:TissueZE" "StageB7:TissueZE" "StageB8:TissueZE"
topTable(fit, coef=10-9)
##                       logFC  AveExpr        t       P.Value     adj.P.Val
## MA_9485570g0010.1  4.502863 2.882118 2001.746 5.729726e-141 3.785573e-136
## MA_9033029g0010.1  4.034120 2.737445 1793.367 3.188583e-138 1.053332e-133
## MA_6316973g0010.1  3.928945 2.704983 1746.611 1.456763e-137 3.208230e-133
## MA_10171234g0010.1 3.740690 2.646880 1662.922 2.453076e-136 4.051807e-132
## MA_193120g0010.1   3.486334 2.568375 1549.848 1.407602e-134 1.859977e-130
## MA_10429987g0010.1 2.645023 2.308711 1175.844 1.111578e-127 1.224014e-123
## MA_118237g0010.1   2.559778 2.282401 1137.948 7.313236e-127 6.902546e-123
## MA_9902954g0010.1  2.490337 2.260968 1107.078 3.556181e-126 2.936917e-122
## MA_74695g0010.1    2.453448 2.249583 1090.679 8.388890e-126 6.158284e-122
## MA_150666g0020.1   2.441221 2.245809 1085.244 1.118093e-125 7.387127e-122
##                           B
## MA_9485570g0010.1  279.9882
## MA_9033029g0010.1  277.9945
## MA_6316973g0010.1  277.4694
## MA_10171234g0010.1 276.4441
## MA_193120g0010.1   274.8589
## MA_10429987g0010.1 267.2728
## MA_118237g0010.1   266.2256
## MA_9902954g0010.1  265.3226
## MA_74695g0010.1    264.8234
## MA_150666g0020.1   264.6549

7 SessionInfo

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] tibble_3.0.4                plotly_4.9.2.1             
##  [3] limma_3.46.0                hyperSpec_0.99-20201127    
##  [5] xml2_1.3.2                  lattice_0.20-41            
##  [7] here_1.0.0                  ggplot2_3.3.2              
##  [9] gplots_3.1.1                dplyr_1.0.2                
## [11] DESeq2_1.30.0               SummarizedExperiment_1.20.0
## [13] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [15] matrixStats_0.57.0          GenomicRanges_1.42.0       
## [17] GenomeInfoDb_1.26.1         IRanges_2.24.0             
## [19] S4Vectors_0.28.0            BiocGenerics_0.36.0        
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_4.0.5            RColorBrewer_1.1-2    
##  [4] httr_1.4.2             rprojroot_2.0.2        tools_4.0.3           
##  [7] R6_2.5.0               KernSmooth_2.23-18     DBI_1.1.0             
## [10] lazyeval_0.2.2         colorspace_2.0-0       withr_2.3.0           
## [13] tidyselect_1.1.0       bit_4.0.4              compiler_4.0.3        
## [16] Cairo_1.5-12.2         DelayedArray_0.16.0    labeling_0.4.2        
## [19] caTools_1.18.0         scales_1.1.1           genefilter_1.72.0     
## [22] stringr_1.4.0          digest_0.6.27          rmarkdown_2.5         
## [25] XVector_0.30.0         jpeg_0.1-8.1           pkgconfig_2.0.3       
## [28] htmltools_0.5.0        highr_0.8              htmlwidgets_1.5.2     
## [31] rlang_0.4.9            RSQLite_2.2.1          generics_0.1.0        
## [34] farver_2.0.3           jsonlite_1.7.1         crosstalk_1.1.0.1     
## [37] BiocParallel_1.24.1    gtools_3.8.2           RCurl_1.98-1.2        
## [40] magrittr_2.0.1         GenomeInfoDbData_1.2.4 Matrix_1.2-18         
## [43] Rcpp_1.0.5             munsell_0.5.0          lifecycle_0.2.0       
## [46] stringi_1.5.3          yaml_2.2.1             zlibbioc_1.36.0       
## [49] blob_1.2.1             crayon_1.3.4           splines_4.0.3         
## [52] annotate_1.68.0        locfit_1.5-9.4         knitr_1.30            
## [55] pillar_1.4.7           geneplotter_1.68.0     XML_3.99-0.5          
## [58] glue_1.4.2             evaluate_0.14          latticeExtra_0.6-29   
## [61] data.table_1.13.4      vctrs_0.3.5            png_0.1-7             
## [64] testthat_3.0.0         gtable_0.3.0           purrr_0.3.4           
## [67] tidyr_1.1.2            xfun_0.19              xtable_1.8-4          
## [70] survival_3.2-7         viridisLite_0.3.0      AnnotationDbi_1.52.0  
## [73] memoise_1.1.0          ellipsis_0.3.1